/*
 * Copyright (c) 2023 Institute of Parallel And Distributed Systems (IPADS), Shanghai Jiao Tong University (SJTU)
 * Licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 */

#include <common/asm.h>

/*
 * void bzero(void *p, size_t size)
 *
 *  x0 - p
 *  x1 - size
 */

BEGIN_FUNC(bzero)
	/*
	 * Zero "size" bytes starting at "p".
	 *
	 * Register roles:
	 *   x0 - current store pointer (advanced past every byte written)
	 *   x1 - bytes left for the "normal" path to zero
	 *   x2 - scratch: loop counters / log2(line size)
	 *   x3 - scratch: masks and constants
	 *   x4 - bytes from p up to the next cache line boundary
	 *   x5 - number of whole cache lines still to be zeroed with dc zva
	 *   x6 - end address of the buffer (p + size)
	 *   x7 - dc zva line size in bytes (0: dc zva must not be used)
	 *
	 * Clobbers x0-x7 and the condition flags. Leaf routine, no stack use.
	 * "size" is a size_t, so every size comparison below uses the
	 * unsigned condition codes (lo/ls), never the signed ones (lt/le).
	 */

	/* Nothing to do for an empty buffer. */
	cbz	x1, ending

	/*
	 * x5 doubles as the "cache line zeroing is pending" flag that is
	 * tested at lead_out_end. It has to be cleared up front because
	 * every early branch to "normal" below skips its calculation, and
	 * a stale value would send us into the dc zva loop.
	 */
	mov	x5, xzr

	/* No use of cache assisted zero for buffers with size <= 16 */
	cmp	x1, #0x10
	b.ls	normal

	/*
	 * Load size of line that will be cleaned by dc zva call.
	 * 0 means that the instruction is not allowed.
	 * NOTE(review): the variable is read as a 64-bit doubleword; this
	 * assumes dczva_line_size is declared 64 bits wide - confirm
	 * against its definition.
	 */
	ldr	x7, =dczva_line_size
	ldr	x7, [x7]
	cbz	x7, normal

	/* Buffer must hold at least one line to use cache zeroing. */
	cmp	x1, x7
	b.lo	normal

	/*
	 * x4 = round_up(p, line size) - p, i.e. the number of lead-in
	 * bytes in front of the first line boundary (0 if p is already
	 * aligned). The mask arithmetic relies on x7 being a power of two.
	 */
	sub	x2, x7, #0x01
	mov	x3, -1
	eor	x3, x3, x2
	add	x4, x0, x2
	and	x4, x4, x3
	sub	x4, x4, x0

	/*
	 * x5 = (size - lead-in) / line size. rbit + clz counts trailing
	 * zeros, which is log2(x7) for a power of two.
	 */
	sub	x5, x1, x4
	rbit	x2, x7
	clz	x2, x2
	lsr	x5, x5, x2

	/*
	 * If number of cache lines is 0, we will not be able to zero
	 * by cache lines, so go normal way with the full size in x1.
	 */
	cbz	x5, normal
	/* x6 is final address to zero */
	add	x6, x0, x1

	/*
	 * Buffer already starts on a line boundary: there is no lead-in
	 * to zero, so start the dc zva loop right away. Do not fall into
	 * "normal" with x1 == 0 - its byte loop expects a non-zero count.
	 */
	cbz	x4, cache_line_zero

	/*
	 * x5 is non-0 and the buffer is unaligned, so "normal" is used to
	 * zero the x4 lead-in bytes before cache zeroing.
	 */
	mov	x1, x4

	/* When jumping here: x0 holds pointer, x1 holds non-zero size */
normal:
	/*
	 * Get buffer offset into 16 byte aligned address; 0 means pointer
	 * is aligned.
	 */
	ands	x2, x0, #0x0f
	b.eq	aligned_to_16
	/*
	 * Calculate one-byte loop runs to 8 byte aligned address. A
	 * pointer that is 8 but not 16 byte aligned yields 8 here, which
	 * takes it to the next 16 byte boundary.
	 */
	ands	x2, x2, #0x07
	mov	x3, #0x08
	sub	x2, x3, x2
	/* x2 = min(bytes missing for alignment, size); x1 = the rest */
	cmp	x1, x2
	csel	x2, x1, x2, ls
	sub	x1, x1, x2

	/*
	 * Byte by byte zeroing writes enough bytes to align the pointer
	 * and at most "size". x2 is at least 1 here because x1 was
	 * non-zero on entry to "normal".
	 */
align:
	strb	wzr, [x0], #0x01
	subs	x2, x2, #0x01
	b.ne	align

	/* Pointer is 8 byte aligned now, unless the buffer is exhausted */
	cmp	x1, #0x10
	b.lo	lead_out
	/*
	 * Check if another 8 bytes must be zeroed to reach a 16 byte
	 * aligned address and do it
	 */
	tbz	x0, #0x03, aligned_to_16
	str	xzr, [x0], #0x08
	sub	x1, x1, #0x08

	/* While jumping here: x0 is 16 byte aligned address, x1 is size */
aligned_to_16:
	/* If size is less than 16 bytes, use lead_out to zero the rest */
	cmp	x1, #0x10
	b.lo	lead_out

	/* x2 = number of 16 byte chunks, at least 1 here */
	lsr	x2, x1, #0x04
zero_by_16:
	stp	xzr, xzr, [x0], #0x10
	subs	x2, x2, #0x01
	b.ne	zero_by_16

	/*
	 * Lead out expects addresses to be aligned to 8 bytes. It is used
	 * to zero buffers with sizes < 16 and what can not be zeroed by
	 * zero_by_16 loop. Each bit of the remaining count selects one
	 * store of the matching width: 8, 4, 2 and 1 bytes.
	 */
	ands	x1, x1, #0x0f
	b.eq	lead_out_end
lead_out:
	tbz	x1, #0x03, lead_out_dword
	str	xzr, [x0], #0x08
lead_out_dword:
	tbz	x1, #0x02, lead_out_word
	str	wzr, [x0], #0x04
lead_out_word:
	tbz	x1, #0x01, lead_out_byte
	strh	wzr, [x0], #0x02
lead_out_byte:
	tbz	x1, #0x00, lead_out_end
	strb	wzr, [x0], #0x01

lead_out_end:
	/*
	 * If x5 is non-zero, this means that normal has been used as
	 * a lead in to align buffer address to cache line size
	 */
	cbz	x5, ending

	/*
	 * Here x5 holds number of lines to zero (non-zero); x6 is final
	 * address of buffer. x0 is cache line aligned pointer. x7 is cache
	 * line size in bytes. x5 reaches 0 when the loop ends, so a later
	 * pass through lead_out_end returns.
	 */
cache_line_zero:
	dc	zva, x0
	add	x0, x0, x7
	subs	x5, x5, #0x01
	b.ne	cache_line_zero

	/* Zero the tail (less than one line) through "normal", if any */
	subs	x1, x6, x0
	b.ne	normal

ending:
	ret
END_FUNC(bzero)
